In [1]:
import pandas as pd
In [2]:
# Load the full tab-separated URL file. It has no header row, so pandas
# labels the three columns 0, 1, 2 until they are renamed later.
df = pd.read_csv(
    "url_expanded.full.txt",
    header=None,
    sep="\t",
)
df.shape
Out[2]:
(97512, 3)
In [3]:
# Peek at the first five raw rows (columns are still the default 0/1/2 labels).
df.head(n=5)
Out[3]:
0
1
2
0
http://www.investmentnews.com/article/20160801...
http://www.investmentnews.com/article/20160801...
0
1
http://ow.ly/3avNPe
https://www.reddit.com/r/cahideas/comments/42i...
0
2
http://stratcom.kma-assc.com/uncategorized/pre...
http://stratcom.kma-assc.com/uncategorized/pre...
3
3
http://ln.is/mabelsaveforschool.com/gbEtv
http://linkis.com/mabelsaveforschool.com/gbEtv
0
4
http://kiw.im/16LfJirkfzE
https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927
0
In [4]:
# Give the three positional columns descriptive names: the URL as collected,
# the URL it expanded to, and the numeric status recorded for the expansion.
df.columns = [
    "URL",
    "EXPANDED",
    "EXPANDED_STATUS",
]
df.head(n=5)
Out[4]:
URL
EXPANDED
EXPANDED_STATUS
0
http://www.investmentnews.com/article/20160801...
http://www.investmentnews.com/article/20160801...
0
1
http://ow.ly/3avNPe
https://www.reddit.com/r/cahideas/comments/42i...
0
2
http://stratcom.kma-assc.com/uncategorized/pre...
http://stratcom.kma-assc.com/uncategorized/pre...
3
3
http://ln.is/mabelsaveforschool.com/gbEtv
http://linkis.com/mabelsaveforschool.com/gbEtv
0
4
http://kiw.im/16LfJirkfzE
https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927
0
In [5]:
# How many rows fall under each expansion status code?
df["EXPANDED_STATUS"].value_counts()
Out[5]:
0 92362
1 3651
3 1489
2 10
Name: EXPANDED_STATUS, dtype: int64
In [6]:
# Sample of the rows whose expansion status is 1.
df[df["EXPANDED_STATUS"].eq(1)].head(n=5)
Out[6]:
URL
EXPANDED
EXPANDED_STATUS
14
http://dailydose.topratedviral.com/article/wom...
http://dailydose.topratedviral.com/article/wom...
1
15
http://gvwy.io/v9h3w9l
http://mabelsaveforschool.com/contest-entry
1
23
http://s.einnews.com/tGmrKnfQ1C
http://s.einnews.com/tGmrKnfQ1C
1
30
http://gvwy.io/lygewah
http://mabelsaveforschool.com/contest-entry
1
59
http://gvwy.io/3ogfrpp
http://mabelsaveforschool.com/contest-entry
1
In [7]:
# First 100 rows whose expansion status is 3, to eyeball which links they are.
df[df["EXPANDED_STATUS"].eq(3)].head(n=100)
Out[7]:
URL
EXPANDED
EXPANDED_STATUS
2
http://stratcom.kma-assc.com/uncategorized/pre...
http://stratcom.kma-assc.com/uncategorized/pre...
3
27
http://dlvr.it/KxCjYs
http://post/142016360553?utm_source=dlvr.it&ut...
3
64
http://soco.space/-m0zuC
http://soco.space/-m0zuC
3
120
http://seusnews.com/?p=1756
http://seusnews.com/?p=1756
3
145
http://deals.buycheap2day.com/US/lndng-st/twt/...
http://deals.buycheap2day.com/US/lndng-st/twt/...
3
240
http://sociably.me/L0pQLX
http://sociably.me/L0pQLX
3
257
http://j.mp/1Zyj2Lp
http://feeds.huffingtonpost.com/c/35496/f/6770...
3
536
https://videotube.livehost.fr/2016/11/28/learn...
https://videotube.livehost.fr/2016/11/28/learn...
3
548
http://dlvr.it/Kv4tfx
http://vulture.feedsportal.com/c/35348/f/66160...
3
560
http://www.fashionisme.us/2013/07/useful-foods...
http://www.fashionisme.us/2013/07/useful-foods...
3
755
http://www.relevantmagazine.com/reject-apathy/...
https://relevantmagazine.com//reject-apathy/vi...
3
829
http://dlvr.it/L1ZVtd
http://reuters.us.feedsportal.com/c/35217/f/65...
3
906
http://Blaylock.Dr.Group
http://Blaylock.Dr.Group/
3
1240
http://reference-and-education.journaleus.com/...
http://reference-and-education.journaleus.com/...
3
1335
http://bit.ly/1UFhfkJ
http://reuters.us.feedsportal.com/c/35217/f/65...
3
1365
http://observer.gm/vaccination-campaign-agains...
http://observer.gm/vaccination-campaign-agains...
3
1455
http://www.Feed24hNews.com/7q9It
http://www.Feed24hNews.com/7q9It
3
1582
https://shar.es/1CMQ2r
http://www.authornicolewalker.com/raising_auti...
3
1626
http://inbrief.media/2016/01/20/when-can-the-f...
http://inbrief.media/2016/01/20/when-can-the-f...
3
1710
http://goo.gl/y7FzIj
http://www.dainikbhaskar.tv/apple/aclu-other-p...
3
1769
http://roots.ly/lHTptg
https://roots.ly/lHTptg
3
1836
http://dlvr.it/MLBH4l
http://master-of-education.goitstar.com/post/1...
3
1894
http://dlvr.it/MDn1j4
http://keepliberty.org/2016/09/10/us-has-spent...
3
1919
http://bit.ly/ONymEF
http://www.longbeachnsw.com.au/
3
1953
http://fb.me/2S1kcWQTE
http://soco.space/qlXx9e
3
2089
http://ow.ly/3bs6KP
http://reuters.us.feedsportal.com/c/35217/f/65...
3
2256
http://bit.ly/21Ootr4
http://cnet.com.feedsportal.com/c/34938/f/6450...
3
2432
http://fb.me/3k28jNlpW
http://personalhealthdiary.co/fda-announce-tha...
3
2438
http://fb.me/3RERphXAT
http://personalhealthdiary.co/fda-announce-tha...
3
2582
http://goo.gl/fb/bpUe5M
http://post/138982303064?utm_source=feedburner...
3
...
...
...
...
4974
http://ow.ly/3a5TfG
http://viralinstant.com/no-adverse-side-effect...
3
4999
http://bit.ly/1TZDQdy
http://cnet.com.feedsportal.com/c/34938/f/6450...
3
5037
http://dlvr.it/KkRMV8
http://www.ynn.io/latest-best-sports-news/garb...
3
5229
http://bit.ly/1Y7rkt2
http://israelnewsreport.net/open-letter-on-don...
3
5249
http://dld.bz/erX9w
http://reuters.us.feedsportal.com/c/35217/f/65...
3
5295
http://salon.com.feedsportal.com/c/35105/f/648...
http://salon.com.feedsportal.com/c/35105/f/648...
3
5343
http://dlvr.it/L32mDB
http://rss.feedsportal.com/c/34793/f/641580/s/...
3
5370
http://soco.space/cM3jRw
http://soco.space/cM3jRw
3
5372
http://dlvr.it/LLBSXN
http://can10.jkmesh.com/post/144533385307?utm_...
3
5459
http://dlvr.it/DJy62M
http://www.techwens.com/blackberry-refutes-cla...
3
5460
http://smartbetty.me/1QT1SWV
http://smartbetty.me/1QT1SWV
3
5544
http://www.sciencedirect.com.libproxy-wb.imf.o...
http://www.sciencedirect.com.libproxy-wb.imf.o...
3
5584
http://ift.tt/28uIJ5f
http://bactema.com/orlando-terror-focus-alread...
3
5597
http://bit.ly/1VEZNzx
http://reuters.us.feedsportal.com/c/35217/f/65...
3
5739
http://live.americarisingpac.org/posts/212
http://live.americarisingpac.org/posts/212
3
5763
https://www.ataxia.org/pdf/SporadicAtaxias.pdf
https://www.ataxia.org/pdf/SporadicAtaxias.pdf
3
5831
http://bit.ly/1PCir7g
http://zdnet.com.feedsportal.com/c/35462/f/675...
3
5910
http://www.wholechildeducation.org/blog/build-...
http://www.wholechildeducation.org/blog/build-...
3
5993
http://www.alltriberr.com/poll-finds-national-...
http://www.alltriberr.com/poll-finds-national-...
3
6026
http://TheBlogToday.co.vu/q0TRBI
http://TheBlogToday.co.vu/q0TRBI
3
6043
http://goo.gl/Hd5WtH
http://casyope.info/5440903-13803496
3
6074
http://bit.ly/1ITFKdd
http://reuters.us.feedsportal.com/c/35217/f/65...
3
6085
http://dlvr.it/DFWbxR
http://zerohedge.feedsportal.com/c/34894/f/645...
3
6089
http://bit.ly/2aqzM87
http://beautifulhairstyles.14p.in/block-the-su...
3
6093
http://nyv.me/l/LJkg
http://nyv.me/l/LJkg
3
6186
http://ift.tt/1Yu6Pq0
http://nydailynews.com.feedsportal.com/c/34148...
3
6239
http://huff.to/1PBu5NG
http://feeds.huffingtonpost.com/c/35496/f/6770...
3
6242
http://www.PittsburghFor.me
http://www.PittsburghFor.me/
3
6250
http://bit.ly/RentShelby
http://ourridelife.com/2016/03/23/you-can-rent...
3
6363
http://dlvr.it/L1bvpM
http://reuters.us.feedsportal.com/c/35217/f/65...
3
100 rows × 3 columns
In [8]:
# Tally the host names of the status-3 expanded URLs.
# Splitting "scheme://host/path" on "/" gives ["scheme:", "", "host", ...],
# so the host is element 2. The vectorised `.str[2]` accessor is used instead
# of `.apply(lambda x: x[2])`: it yields NaN (which value_counts() skips by
# default) for a missing value or a URL with fewer than three pieces, rather
# than raising TypeError/IndexError and aborting the cell.
(
    df.loc[df["EXPANDED_STATUS"] == 3, "EXPANDED"]
    .str.split("/")
    .str[2]
    .value_counts()
)
Out[8]:
reuters.us.feedsportal.com 143
feeds.huffingtonpost.com 92
soco.space 63
personalhealthdiary.co 53
www.ynn.io 45
rss.feedsportal.com 30
post 19
l.herald.ly 17
www.trendgizmo.com 17
cnet.com.feedsportal.com 16
zerohedge.feedsportal.com 15
pumpkin-dukan-diet.7legend.net 15
zdnet.com.feedsportal.com 15
healthlogics.press 15
nydailynews.com.feedsportal.com 14
appleinsider.com.feedsportal.com 14
dailyeeuu.tusueldo.com 13
politics.tusueldo.com 13
telegraph.feedsportal.com 13
www.techwens.com 12
advertising-education.live-newsx.com 12
www.youthsnews.com 12
master-of-education.goitstar.com 12
stratcom.kma-assc.com 12
ndtv.com.feedsportal.com 11
www.dainikbhaskar.tv 11
100-singalong-songs-for-kids.goitstar.com 10
sociably.me 10
TheBlogToday.co.vu 10
gbr.jkmesh.com 10
...
overdrive.ae 1
basketballfanzone.org 1
csnn.gov 1
mrtopstep.com. 1
nipple-huggers.com 1
www.peerlyst.com 1
messagwww.mharrell3.myrandf.biz 1
www.acscva.com 1
israelnewsreport.net 1
newsinkansas.ml 1
acclaimcollegecounseling.com 1
www.HoustonFor.me 1
got-tlc.info 1
amazinggiftsforall.top 1
carsautosglobal.com 1
www.highplainsdailynews.com 1
www.houstonlocal.news 1
donaldtrumpreviews.com 1
www.usworldreport.com 1
lfger.com 1
albania.jobs.forjobsearch.com 1
forum.theworldnewsmedia.org 1
suavebaes.com 1
www.ihealthbeat.org 1
bestoflisticles.com 1
jobsearch.com.de 1
inbrief.media 1
1.black 1
roots.ly 1
rockymountainrv.com 1
Name: EXPANDED, dtype: int64
In [9]:
# Status-3 rows whose expanded URL is hosted on www.huffingtonpost.com.
# The host is element 2 of the URL split on "/"; `.str[2]` returns NaN for
# missing or malformed URLs (which then simply compare unequal) instead of
# raising the way `.apply(lambda x: x[2])` does.
# NOTE(review): the status-3 host tally lists feeds.huffingtonpost.com rather
# than www.huffingtonpost.com, so an empty result here looks expected --
# confirm that www.* was the host meant to be checked.
df[
    (df["EXPANDED_STATUS"] == 3)
    & (df["EXPANDED"].str.split("/").str[2] == "www.huffingtonpost.com")
].head()
Out[9]:
URL
EXPANDED
EXPANDED_STATUS
In [10]:
# Load the separate tab-separated "error" file; like the full file it has no
# header row. Presumably this is a second expansion pass over the status-3
# URLs -- TODO confirm against whatever produced the file.
df_err = pd.read_csv(
    "url_expanded.error.1.txt",
    header=None,
    sep="\t",
)
df_err.shape
Out[10]:
(1489, 3)
In [11]:
# Use the same column names as the full frame so the two can be aligned.
df_err.columns = [
    "URL",
    "EXPANDED",
    "EXPANDED_STATUS",
]
df_err.head(n=5)
Out[11]:
URL
EXPANDED
EXPANDED_STATUS
0
http://ift.tt/1mBLaPF
http://reuters.us.feedsportal.com/c/35217/f/65...
3
1
http://logs.wsj.com/pharmalot/2015/06/08/merck...
http://logs.wsj.com/pharmalot/2015/06/08/merck...
3
2
http://bit.ly/1oRL1bE
http://rss.feedsportal.com/c/34793/f/641580/s/...
3
3
http://stratcom.kma-assc.com/uncategorized/pre...
http://stratcom.kma-assc.com/uncategorized/pre...
3
4
http://americagunban.com/moscow-says-usa-actio...
http://americagunban.com/moscow-says-usa-actio...
0
In [12]:
# Distribution of expansion status codes within the error file.
df_err["EXPANDED_STATUS"].value_counts()
Out[12]:
3 1396
0 71
1 22
Name: EXPANDED_STATUS, dtype: int64
In [13]:
# Tally the host names of the URLs that are still status 3 in the error file.
# The host is element 2 of the URL split on "/". `.str[2]` is used in place of
# `.apply(lambda x: x[2])` so that a missing value or a URL with fewer than
# three pieces becomes NaN (ignored by value_counts()) instead of raising.
(
    df_err.loc[df_err["EXPANDED_STATUS"] == 3, "EXPANDED"]
    .str.split("/")
    .str[2]
    .value_counts()
)
Out[13]:
reuters.us.feedsportal.com 143
feeds.huffingtonpost.com 92
soco.space 63
personalhealthdiary.co 53
www.ynn.io 45
rss.feedsportal.com 30
l.herald.ly 17
www.trendgizmo.com 17
cnet.com.feedsportal.com 16
zdnet.com.feedsportal.com 15
pumpkin-dukan-diet.7legend.net 15
zerohedge.feedsportal.com 15
appleinsider.com.feedsportal.com 14
nydailynews.com.feedsportal.com 14
politics.tusueldo.com 13
telegraph.feedsportal.com 13
dailyeeuu.tusueldo.com 13
advertising-education.live-newsx.com 12
master-of-education.goitstar.com 12
www.techwens.com 12
www.youthsnews.com 12
stratcom.kma-assc.com 12
www.dainikbhaskar.tv 11
ndtv.com.feedsportal.com 11
sociably.me 10
TheBlogToday.co.vu 10
gbr.jkmesh.com 10
dailyhobbies.net 10
100-singalong-songs-for-kids.goitstar.com 10
nu.hackn.us 9
...
www.samgotechnology.com 1
Benghazi.You 1
www.meltsalad.com 1
livehealthy-team.com 1
techdaily.xyz 1
post 1
furbabypetpalace.com 1
www.gadgets-4g.com 1
cybermick.com 1
amazinggiftsforall.top 1
COUNTRY.How 1
newsinkansas.ml 1
www.houstonlocal.news 1
lfger.com 1
albania.jobs.forjobsearch.com 1
forum.theworldnewsmedia.org 1
suavebaes.com 1
crenshaw.house.gov 1
observer.gm 1
bestoflisticles.com 1
Stranger.com 1
jobsearch.com.de 1
orlando 1
1.black 1
carsautosglobal.com 1
www.acscva.com 1
www.lidoautobody.com 1
messagwww.mharrell3.myrandf.biz 1
www.thechildcaresquare.com 1
rockymountainrv.com 1
Name: EXPANDED, dtype: int64
In [14]:
# Re-key both frames by the original URL so rows can be matched by label.
# Note: this cell is not re-runnable as-is -- once "URL" is the index it is
# no longer a column, so a second run raises KeyError.
df = df.set_index(keys="URL")
df_err = df_err.set_index(keys="URL")
(df.shape, df_err.shape)
Out[14]:
((97512, 2), (1489, 2))
In [15]:
# Confirm the URL is now the index and two data columns remain.
df.head(n=5)
Out[15]:
EXPANDED
EXPANDED_STATUS
URL
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike
http://www.investmentnews.com/article/20160801...
0
http://ow.ly/3avNPe
https://www.reddit.com/r/cahideas/comments/42i...
0
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/
http://stratcom.kma-assc.com/uncategorized/pre...
3
http://ln.is/mabelsaveforschool.com/gbEtv
http://linkis.com/mabelsaveforschool.com/gbEtv
0
http://kiw.im/16LfJirkfzE
https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927
0
In [16]:
# Overwrite the expansion result of every URL present in the error file with
# the values from that file. The assignment is label-based: pandas aligns the
# right-hand frame to the selected rows/columns by URL and column name.
# `.loc` replaces the original `.ix`, which was deprecated in pandas 0.20 and
# removed in 1.0; `.loc` is the purely label-based equivalent needed here.
# Assumes URLs are unique in both indexes -- TODO confirm.
df.loc[df_err.index, ["EXPANDED", "EXPANDED_STATUS"]] = df_err[["EXPANDED", "EXPANDED_STATUS"]]
In [17]:
# Sanity check: the status distribution of the overwritten rows should now
# match the distribution seen in the error file.
# Uses a single label-based `.loc[rows, column]` lookup instead of the removed
# `.ix` accessor followed by chained `[...]` indexing.
df.loc[df_err.index, "EXPANDED_STATUS"].value_counts()
Out[17]:
3 1396
0 71
1 22
Name: EXPANDED_STATUS, dtype: int64
In [18]:
df.to_csv("url_expanded.merged.txt", sep="\t")
! head url_expanded.merged.txt
URL EXPANDED EXPANDED_STATUS
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike 0
http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i3ew/w_farting_mid_rimjob/ 0
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/ http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/ 3
http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0
http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0
http://fb.me/241s7UtEJ https://www.facebook.com/story.php?story_fbid=1251035921618693&id=100001368900242 0
http://owl.li/XkyUO https://www.youtube.com/watch?v=xtspq5T7B44&feature=em-uploademail 0
http://goo.gl/RTQ29 http://localbuzznetwork.com/clarksburg-wv-job-search/ 0
http://buff.ly/1SNoZU6 http://weightlosslaw.com/01cdea672dbfe8?utm_content=bufferb9ed1&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer 0
In [ ]:
Content source: napsternxg/ControversialTweetAnalysis
Similar notebooks: